clear all
set more off
cap log close

***************************************************************************************************
* Program: 1_process_pings.do
***************************************************************************************************

************************************************************************
** set global macros
************************************************************************
global prison_dir "~/Dropbox/Prison_Covid/"
global homes_folder "/Volumes/Long_Term_Storage/Safegraph_Data_Stata/Home_Files"
global geo7_folder "/Volumes/Long_Term_Storage/Safegraph_Data_Stata/HalfHour_Geo7_Files/"


************************************************************
* STEP 1. preprocess raw ping files 
************************************************************
cap program drop prep_raw_files
program define prep_raw_files
    syntax, yr(integer)
    // determine months based on year
    if `yr' == 2017 {
        local mon "02 03 04 05 06 07 08 09 10 11"
    }
    else if `yr' == 2018 {
        local mon "03 04 05 06 07 08 09 10 11 12"
    }
    else {
        // 2019 and 2020 full year
        local mon "01 02 03 04 05 06 07 08 09 10 11 12"
    }
    local yr_short = substr("`yr'",3,2)
    
    forvalues m of local mon {
        cd "${ping_folder}/"
        if `m' < 10 {
            fs Pings_`yr_short'_0`m'_*.dta
        }
        else {
            fs Pings_`yr_short'_`m'_*.dta
        }
        foreach F in `r(files)' {
            use "`F'", clear
            gen str2 geo2 = substr(geo_hash,1,2)
            keep if inlist(geo2, "9p", "9r", "9n", "9q", "9m")
            drop geo2
            gen str7 geo7 = substr(geo_hash,1,7)
            merge m:1 geo7 using "${prison_dir}/Prison_Boundaries/Prison_Boundaries_CA_geo7_cover_and_nbhors.dta", nogenerate keep(match)
            drop geo7
            capture geoinpoly latitude longitude using "${prison_dir}/Prison_Boundaries/Prison_Boundaries_CA_coor.dta"
            if _rc == 2000 {
                gen byte _ID = .
            }
            * drop if _ID == .
            rename _ID Shapefile_ID
            sort ID_`yr' utc_timestamp
            compress
            save "${prison_dir}/Pings/`yr'/Prison_`F'", replace
        }
    }
end

***************************************************************************************************
* STEP 2. Process ping data to get qualified phones (phones that spend at least 10 minutes a day)
***************************************************************************************************

*****
* program: read ping data
cap program drop read_ping_data
program define read_ping_data

	syntax, yr(integer) // possible values = 17, 18, 19, 20

	set more off
	clear
	
	if `yr' == 20 {
		local mon "01 02 03 04 05 06 07 08 09 10 11 12"  
		display "`mon'"
	}
	if `yr' == 19 {
		local mon "01 02 03 04 05 06 07 08 09 10 11 12"  
		display "`mon'"
	}
	if `yr' == 18 {
		local mon "03 04 05 06 07 08 09 10 11 12"  
		display "`mon'"
	}
	if `yr' == 17 {
		local mon "02 03 04 05 06 07 08 09 10 11"  
		display "`mon'"
	}	
	
	local day = "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31"
	foreach m of local mon {
		foreach d of local day { 
			
			capture confirm file "${prison_dir}/Pings/20`yr'/Prison_pings_`yr'_`m'_`d'.dta" 
			
			if _rc == 0 {
				
				append using "${prison_dir}/Pings/20`yr'/Prison_pings_`yr'_`m'_`d'.dta" 
			
			}
			
		}

	}
	
end

*****
* program: assign outside ping based on nearest distance between a ping to polygon's vertices 
* (no year input required)
cap program drop assign_outside_ping
program define assign_outside_ping

	syntax [, cutoff(integer 999999)] // default: 9999.99 km
	gen long baseid = _n 
	
	* match pings that are outside to nearest polygon 
	* calculate distance to nearest vertex 
	geonear baseid latitude longitude using ///
		"${prison_dir}/Yilin_exploratory/Prison_Boundaries_CAMDNCSC_coor_forgeonear.dta", ///
		neighbors(fid _Y _X)

	** drop pings (outside of fenceline) and `cutoff' m away from nearest polygon's vertice
	drop if km_to_nid > `cutoff' * 0.001 & missing(Shapefile_ID)
	
	ren nid fid
	merge m:1 fid using "${prison_dir}/Yilin_exploratory/Prison_Boundaries_CAMDNCSC_coor_forgeonear.dta", keepusing(_ID) keep(master match)
	
	* an approximation to assign pings to facility based on the nearest distance to facility polygon vertice
	* caveat: this is NOT the nearest distance to polygon
	* the approximation could be wrong if multiple facilities are close in proximity
	replace Shapefile_ID = _ID if missing(Shapefile_ID)
	
	drop baseid fid _ID 
	
end 

*****
* program: generate time variables
cap program drop gen_time_vars
program define gen_time_vars

	syntax, yr(integer)  // must pass year for DST logic

	**************************************
	* obtain local Hours 
	* for all three states: UTC_DT_offset = 4
	* for California: offset is 7 hours (PDT) or 8 hours (PST)
	**************************************
	gen UTC_DT_offset = 7 

	// tab  UTC
	gen Hour_UTC = hh((utc_timestamp * 1000) + mdyhms(1,1,1970,0,0,0))
	gen StataDate_UTC = dofc((utc_timestamp * 1000) + mdyhms(1,1,1970,0,0,0))
	gen min = mm((utc_timestamp * 1000) + mdyhms(1,1,1970,0,0,0))

	gen Stata_Date_local = StataDate_UTC 
	replace Stata_Date_local = Stata_Date_local - 1 if Hour_UTC - UTC_DT_offset < 0 

	gen Hour_local = Hour_UTC - UTC_DT_offset 
	replace Hour_local = Hour_local + 24 if Hour_UTC - UTC_DT_offset < 0 

	* adjust for DST - dates differ by year
	* DST in US: 2nd Sunday in March (spring forward) and 1st Sunday in November (fall back)
	if `yr' == 2017 {
		local dst_spring = "3/12/2017"
		local dst_fall = "11/5/2017"
	}
	else if `yr' == 2018 {
		local dst_spring = "3/11/2018"
		local dst_fall = "11/4/2018"
	}
	else if `yr' == 2019 {
		local dst_spring = "3/10/2019"
		local dst_fall = "11/3/2019"
	}
	else if `yr' == 2020 {
		local dst_spring = "3/8/2020"
		local dst_fall = "11/1/2020"
	}
	
	replace Hour_local = Hour_local - 1 if ((Stata_Date_local == date("`dst_spring'","MDY") & Hour_local <= 2) | (Stata_Date_local < date("`dst_spring'","MDY"))) 
	replace Hour_local = Hour_local - 1 if ((Stata_Date_local == date("`dst_fall'","MDY") & Hour_local >= 2) | (Stata_Date_local > date("`dst_fall'","MDY")))  

	replace Stata_Date_local = Stata_Date_local - 1 if Hour_local < 0 
	replace Hour_local = Hour_local + 24 if Hour_local < 0 
	format Stata_Date_local %td
	
	gen week = week(Stata_Date_local)
	gen month = month(Stata_Date_local)	
		
	gen hour_of_day = Hour_local + min / 60 

	drop StataDate_UTC Hour_UTC min UTC_DT_offset

	compress
	
end


*****
program define get_qualified_ids

	syntax, yr(integer)

	**************************************
	* Merge to get facilities characteristics 
	**************************************
	ren Shapefile_ID _ID
	merge m:1 _ID using  "${prison_dir}/Yilin_exploratory/cdcr-population-data-master/data/cdcr_population_w_ShapefileID.dta", ///
		keepusing(NAME ANALYTICSAMPLE) ///
		keep(match) nogenerate

	label define faci_label 1 "State Prison (Incl. Med Facility)" 2 "State Prison (Work)" ///
		3 "Transitional Release Facilities" 4 "Else (Pretrial, combined jail/prison)"
	label values ANALYTICSAMPLE faci_label

	ren _ID Shapefile_ID

	* generate time variables
	gen_time_vars, yr(`yr')

	* How many number of days do we observe phones in a prison? 
	gegen tot_pings = count(utc_timestamp), by(ID_`yr')

	gunique Stata_Date_local, by(ID_`yr') generate(days_nb_faci)
	label variable days_nb_faci "Days ping nearby facility"

	* calculate the duration per day 
	gegen time_in_faci_day = total(Sec_Till_Next_Ping / 60), by(ID_`yr' Stata_Date_local Shapefile_ID)

	egen tag_ID_day_faci = tag(ID_`yr' Stata_Date_local Shapefile_ID) 

	**************************************
	* only keep pings with at least 10 minutes in any of the day in a facility
	**************************************

	keep if time_in_faci_day >= 10 & time_in_faci_day != .

	egen tag_ID = tag(ID_`yr')

	**************************************
	* collapse to ID level  
	**************************************

	keep if tag_ID 
	keep ID_`yr' 

	sort ID_`yr'
	compress
	
end 



******************************************************************************************************************
* STEP 3. Pull all geo7-halfhours for these qualified phones.
******************************************************************************************************************

*****
* program: 
cap program drop pull_geo7_halfhours
program define pull_geo7_halfhours
    syntax, yr(integer)
    if `yr' == 2017 {
        local mon "02 03 04 05 06 07 08 09 10 11"
        local yr_short 17
    }
    else if `yr' == 2018 {
        local mon "03 04 05 06 07 08 09 10 11 12"
        local yr_short 18
    }
    else if `yr' == 2019 {
        local mon "01 02 03 04 05 06 07 08 09 10 11 12"
        local yr_short 19
    }
    else if `yr' == 2020 {
        local mon "01 02 03 04 05 06 07 08 09 10 11 12"
        local yr_short 20
    }
    cd "${geo7_folder}/`yr'"
    foreach m of local mon {
        if `m' < 10 {
            fs Geo7HH_`yr_short'_0`m'_*.dta
        }
        else {
            fs Geo7HH_`yr_short'_`m'_*.dta
        }
        foreach F in `r(files)' {
            use "`F'", clear
            merge m:1 ID_`yr' using "${prison_dir}/Pings/CA_state_prison_phones_10mins_`yr'.dta", nogenerate keep(match)
            sort ID_`yr' StataDate_UTC HourOfDay_UTC SecondHalfHour
            compress
            save "${prison_dir}/Geo7HHs/`yr'/Prison_`F'", replace
        }
    }
end


******************************************************************************************************************
* STEP 4. Get Geohash-7 Half Hours in Prison
******************************************************************************************************************

cap program drop get_geo7hh_in_prisons
program define get_geo7hh_in_prisons
    syntax, yr(integer)
    local yr_short = substr("`yr'",3,2)
    
    cd "/mnt/phd/yzhuo/prison/"
    set more off
    clear
    
    // determine month list based on year rules
    if `yr' == 2017 {
        local mon "02 03 04 05 06 07 08 09 10 11"
    }
    else if `yr' == 2018 {
        local mon "03 04 05 06 07 08 09 10 11 12"
    }
    else {
        local mon "01 02 03 04 05 06 07 08 09 10 11 12"
    }
    local day = "01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31"

    foreach m of local mon{
        foreach d of local day{
            capture confirm file "Geo7HHs/`yr'/Prison_Geo7HH_`yr_short'_`m'_`d'.dta.dta"
            if _rc == 0 {
                append using "Geo7HHs/`yr'/Prison_Geo7HH_`yr_short'_`m'_`d'.dta.dta"
            }
        }
    }

    compress

    ** merge to keep only prison covers 
    ren Geohash_7 geohash_7
    merge m:1 geohash_7 using "data/CA_state_prison_geohash7_nbor_crosswalk.dta", keep(match) nogenerate keepusing(geohash_7 _ID geo7_nbor)
    ren _ID Shapefile_ID 

    * calculate local hours (similar logic as gen_time_vars)
    gen Stata_Date_local = StataDate_UTC 
    replace Stata_Date_local = Stata_Date_local - 1 if HourOfDay_UTC - UTC_Offset_nDST < 0

    gen Hour_local = HourOfDay_UTC - UTC_Offset_nDST
    replace Hour_local = Hour_local + 24 if HourOfDay_UTC - UTC_Offset_nDST < 0 

    * adjust for DST dates by year
    if `yr' == 2017 {
        local dst_spring = "3/12/2017"; local dst_fall = "11/5/2017"
    }
    else if `yr' == 2018 {
        local dst_spring = "3/11/2018"; local dst_fall = "11/4/2018"
    }
    else if `yr' == 2019 {
        local dst_spring = "3/10/2019"; local dst_fall = "11/3/2019"
    }
    else if `yr' == 2020 {
        local dst_spring = "3/8/2020"; local dst_fall = "11/1/2020"
    }

    replace Hour_local = Hour_local - 1 if ((Stata_Date_local== date("`dst_spring'","MDY") & Hour_local <= 2) | (Stata_Date_local <  date("`dst_spring'","MDY"))) 
    replace Hour_local = Hour_local - 1 if ((Stata_Date_local==  date("`dst_fall'","MDY") & Hour_local >= 2) | (Stata_Date_local > date("`dst_fall'","MDY")))  

    replace Stata_Date_local = Stata_Date_local - 1 if Hour_local < 0 
    replace Hour_local = Hour_local + 24 if Hour_local < 0 
    format Stata_Date_local %td

    drop StataDate_UTC HourOfDay_UTC UTC_Offset_DST UTC_Offset_nDST
    if `yr' == 2020 {
        drop if Stata_Date_local < date("1/1/2020","MDY") // drop 2019 obs
    }

    gen month = month(Stata_Date_local)

    compress

    // additional merges and labeling mirror earlier code
    if `yr' == 2020 {
        merge m:1 ID_2020 using "data/ID_2019_in_prison_post03_2020.dta"
        drop if _merge == 2
        rename _merge merge_post202003ID
    }

    ren Shapefile_ID _ID
    merge m:1 _ID using  "data/cdcr_population_w_ShapefileID.dta", ///
        keepusing(NAME ANALYTICSAMPLE) ///
         nogenerate
    ren  _ID Shapefile_ID

    label define faci_label 1 "State Prison (Incl. Med Facility)" 2 "State Prison (Work)" ///
        3 "Transitional Release Facilities" 4 "Else (Pretrial, combined jail/prison)"
    label values ANALYTICSAMPLE faci_label

    gen byte non_SP = inlist(NAME, "CUESTA CONSERVATION CAMP #24", "DEPARTMENT OF STATE HOSPITALS - ATASCADERO", ///
                "DEPARTMENT OF STATE HOSPITALS - COALINGA", "MCFARLAND FEMALE COMMUNITY REENTRY FACILITY", ///
                "FOLSOM WOMEN'S FACILITY (FWF)")
    drop if non_SP == 1

    split NAME, parse(" (")
    drop NAME2 
    replace NAME1 = proper(NAME1)
    rename NAME1 prison_name
    replace prison_name = "Central California Women's Facility" if prison_name == "Central California Women'S Facility"
    replace prison_name = "California Men's Colony" if prison_name == "California Men'S Colony"
    replace prison_name = "CA Substance Abuse Treatment Facility" if prison_name == "Ca Substance Abuse Treatment Facility"
    replace prison_name = "Pleasant Valley State Prison" if prison_name == "Pleasant Valley State Prison(Pvsp)"

    gen day_of_week = dow(Stata_Date_local)
    gen byte weekend = (day_of_week == 0 | day_of_week == 6)
    gen byte fri_sat_sun = inlist(day_of_week, 0, 5, 6)

    compress
    save "data/output/Geo7HH_in_prison_`yr'_StatePrison.dta", replace
end



***************************************************************************
* IMPLEMENTATION
***************************************************************************


forvalues yr = 2017/2020 {
	display "Processing `yr'"
    read_ping_data, yr(`yr')
    drop Apple_0_Google_1 horizontal_accuracy
    assign_outside_ping
    get_qualified_ids, yr(`yr')
    save "${prison_dir}/Pings/CA_state_prison_phones_10mins_`yr'.dta", replace
    pull_geo7_halfhours, yr(`yr')
	process_year, yr(`yr')
	get_geo7hh_in_prisons, yr(`yr')
}

